******************************************************************************************************************************************************
*This do file creates datasets for cross validation 
*	a.	Input files:
*		i.	GSO_CLEAN.dta
*		ii.	PCI_repeated_cross_CLEAN.dta
*		iii.crosswalk_province.csv
*		iv.	isic_alphabet_crosswalk.dta
*	b.	Output files:
*		i.	crossvalidation_2digit_allfirms
*		ii.	crossvalidation_2digit_exclude10												   																	    *
******************************************************************************************************************************************************

clear all
set more off
set mem 600m
set maxvar 3000

*SET DIRECTORY HERE*
global dir_data_original /Users/Jie/Dropbox (Personal)/CorruptionIncome/EJFinalSubmission/data&program/original raw data/
global dir_data_coded /Users/Jie/Dropbox (Personal)/CorruptionIncome/EJFinalSubmission/data&program/intermediary data/
global dir_data_analysis /Users/Jie/Dropbox (Personal)/CorruptionIncome/EJFinalSubmission/data&program/analysis data/

****************Version 1: 2-digit code and excluding small firms*****************************

use "${dir_data_coded}/GSO_CLEAN.dta",clear

**KEEP ONLY PRIVATE DOMESTIC FIRMS
drop if lhdn<7 | lhdn>11
drop if isic_rev4_2digit==""

**KEEP ONLY FIRMS WITH MORE THAN 10 EMPLOYEES
local minsize = 10	
drop if employ<`minsize' & employ!=.

**Create per-firm rjt variables
bys isic_rev4_2digit province year: egen temp_med_revenue = median(revenue) 
bys isic_rev4_2digit province year: egen temp_med_revenue_main = median(revenue_main) 
bys isic_rev4_2digit province year: egen temp_med_employ = median(employ) 
bys isic_rev4_2digit province year: egen temp_med_assets = median(assets)

bys isic_rev4_2digit province year: egen temp_mean_revenue = mean(revenue) 
bys isic_rev4_2digit province year: egen temp_mean_revenue_main = mean(revenue_main) 
bys isic_rev4_2digit province year: egen temp_mean_employ = mean(employ) 
bys isic_rev4_2digit province year: egen temp_mean_assets = mean(assets) 

foreach v in revenue revenue_main employ assets {
	bys isic_rev4_2digit province year: egen med_`v' = min(temp_med_`v') 
	bys isic_rev4_2digit province year: egen mean_`v'=min(temp_mean_`v')
	g ln_med_`v'=ln(med_`v')
	g ln_mean_`v'=ln(mean_`v')
	}

collapse (mean) med_revenue ln_med_revenue med_revenue_main ln_med_revenue_main med_employ ln_med_employ med_assets ln_med_assets mean_revenue ln_mean_revenue mean_revenue_main ln_mean_revenue_main mean_assets ln_mean_assets mean_employ ln_mean_employ, by(isic_rev4_2digit province year)
sort province
tempfile GSO_collapsed
save `GSO_collapsed'

insheet using "${dir_data_original}/Crosswalks/crosswalk_province.csv", comma clear
sort province
merge 1:m province using `GSO_collapsed'
keep if _merge==3
drop _merge
drop if pci_id==.
drop if year==.
drop if isic_rev4_2digit==""
drop province
sort pci_id isic_rev4_2digit year
save `GSO_collapsed', replace

**READ IN PCI
use "${dir_data_coded}/PCI_repeated_cross_CLEAN.dta", clear

**HARMONIZE YEAR VAR
replace year=year-1

**KEEP FIRMS WITH MORE THAN 10 lagged EMPLOYEES 
g yrsopen = year+1 - a1
drop if a9_2==.b & a9_3==.b
keep if (a9_2>2 & a9_2!=.b) | (a9_2==.b & a9_3>2 & yrsopen>0 & yrsopen!=.)

tab a9_3, mi
rename a9_3 employ_pci

tab employ_pci

g employ_n=.

**USE EMPIRICAL MEAN FROM GSO FOR EACH EMPLOYMENT CATEGORY
replace employ_n = 3 if employ_pci == 1
replace employ_n = 6.5 if employ_pci == 2
replace employ_n = 19.3 if employ_pci == 3
replace employ_n = 91.5 if employ_pci == 4
replace employ_n = 240.8 if employ_pci == 5
replace employ_n = 377 if employ_pci == 6
replace employ_n = 683 if employ_pci == 7
replace employ_n = 2082.6 if employ_pci == 8

drop employ_pci
rename employ_n employ_pci

drop if isic_rev4_2digit==""
foreach v in employ_pci {
	bys isic_rev4_2digit pci_id year: egen temp_med_`v' = median(`v') 
	bys isic_rev4_2digit pci_id year: egen temp_mean_`v' = mean(`v')
	bys isic_rev4_2digit pci_id year: egen med_`v' = min(temp_med_`v') 
	bys isic_rev4_2digit pci_id year: egen mean_`v'=min(temp_mean_`v')
	g ln_med_`v'=ln(med_`v')
	g ln_mean_`v'=ln(mean_`v')
	}

drop if pci_id==.
drop if year==.
	
collapse (mean)  med_employ_pci mean_employ_pci ln_med_employ_pci ln_mean_employ_pci,by(isic_rev4_2digit pci_id year)
sort pci_id isic_rev4_2digit year
merge 1:1 pci_id isic_rev4_2digit year using `GSO_collapsed'
tab _merge
keep if _merge==3	

tab year if _merge==3 
drop _merge

sort isic_rev4_2digit
tostring isic_rev4_2digit,replace
merge m:1 isic_rev4_2digit using "${dir_data_original}/Crosswalks/isic_alphabet_crosswalk.dta"
keep if _merge==3
drop _merge

g t= year
g r= pci_id
g j= isic_rev4_2digit
egen rj=group(r j)
egen rb=group(r broad)

save "${dir_data_analysis}/crossvalidation_2digit_exclude10.dta", replace


****************Version 2: 2-digit code and all firms*****************************

use "${dir_data_coded}/GSO_CLEAN.dta",clear

**KEEP ONLY PRIVATE DOMESTIC FIRMS
drop if lhdn<7 | lhdn>11
drop if isic_rev4_2digit==""

**Create per-firm rjt variables
bys isic_rev4_2digit province year: egen temp_med_revenue = median(revenue) 
bys isic_rev4_2digit province year: egen temp_med_revenue_main = median(revenue_main) 
bys isic_rev4_2digit province year: egen temp_med_employ = median(employ) 
bys isic_rev4_2digit province year: egen temp_med_assets = median(assets)

bys isic_rev4_2digit province year: egen temp_mean_revenue = mean(revenue) 
bys isic_rev4_2digit province year: egen temp_mean_revenue_main = mean(revenue_main) 
bys isic_rev4_2digit province year: egen temp_mean_employ = mean(employ) 
bys isic_rev4_2digit province year: egen temp_mean_assets = mean(assets) 

foreach v in revenue revenue_main employ assets {
	bys isic_rev4_2digit province year: egen med_`v' = min(temp_med_`v') 
	bys isic_rev4_2digit province year: egen mean_`v'=min(temp_mean_`v')
	g ln_med_`v'=ln(med_`v')
	g ln_mean_`v'=ln(mean_`v')
	}

collapse (mean) med_revenue ln_med_revenue med_revenue_main ln_med_revenue_main med_employ ln_med_employ med_assets ln_med_assets mean_revenue ln_mean_revenue mean_revenue_main ln_mean_revenue_main mean_assets ln_mean_assets mean_employ ln_mean_employ, by(isic_rev4_2digit province year)
sort province
tempfile GSO_collapsed
save `GSO_collapsed'

insheet using "${dir_data_original}/Crosswalks/crosswalk_province.csv", comma clear
sort province
merge 1:m province using `GSO_collapsed'
keep if _merge==3
drop _merge
drop if pci_id==.
drop if year==.
drop if isic_rev4_2digit==""
drop province
sort pci_id isic_rev4_2digit year
save `GSO_collapsed', replace


**READ PCI
use "${dir_data_coded}/PCI_repeated_cross_CLEAN.dta", clear

**HARMONIZE YEAR VAR
replace year=year-1

tab a9_3, mi
rename a9_3 employ_pci

tab employ_pci
g employ_n=.
**USE EMPIRICAL MEAN FROM GSO FOR EACH EMPLOYMENT CATEGORY
replace employ_n = 3 if employ_pci == 1
replace employ_n = 6.5 if employ_pci == 2
replace employ_n = 19.3 if employ_pci == 3
replace employ_n = 91.5 if employ_pci == 4
replace employ_n = 240.8 if employ_pci == 5
replace employ_n = 377 if employ_pci == 6
replace employ_n = 683 if employ_pci == 7
replace employ_n = 2082.6 if employ_pci == 8

drop employ_pci
rename employ_n employ_pci

drop if isic_rev4_2digit==""
foreach v in employ_pci {
	bys isic_rev4_2digit pci_id year: egen temp_med_`v' = median(`v') 
	bys isic_rev4_2digit pci_id year: egen temp_mean_`v' = mean(`v')
	bys isic_rev4_2digit pci_id year: egen med_`v' = min(temp_med_`v') 
	bys isic_rev4_2digit pci_id year: egen mean_`v'=min(temp_mean_`v')
	g ln_med_`v'=ln(med_`v')
	g ln_mean_`v'=ln(mean_`v')
	}

drop if pci_id==.
drop if year==.
	
collapse (mean)  med_employ_pci mean_employ_pci ln_med_employ_pci ln_mean_employ_pci,by(isic_rev4_2digit pci_id year)
sort pci_id isic_rev4_2digit year
merge 1:1 pci_id isic_rev4_2digit year using `GSO_collapsed'
tab _merge
keep if _merge==3				

tab year if _merge==3 
drop _merge

sort isic_rev4_2digit
tostring isic_rev4_2digit,replace
merge m:1 isic_rev4_2digit using "${dir_data_original}/Crosswalks/isic_alphabet_crosswalk.dta"
keep if _merge==3 
drop _merge

g t= year
g r= pci_id
g j= isic_rev4_2digit
egen rj=group(r j)
egen rb=group(r broad)

save "${dir_data_analysis}/crossvalidation_2digit_allfirms.dta", replace


